## Loading required package: limma
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
##
## plotMA
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## Filter, Find, Map, Position, Reduce, anyDuplicated, append,
## as.data.frame, basename, cbind, colnames, dirname, do.call,
## duplicated, eval, evalq, get, grep, grepl, intersect,
## is.unsorted, lapply, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, rank, rbind, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unsplit, which,
## which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
## -- Attaching packages -------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.2
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------- tidyverse_conflicts() --
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Paired-end sequencing was performed on primary cultures from parathyroid tumors of 4 patients at 2 time points over 3 conditions (control, treatment with diarylpropionitrile (DPN) and treatment with 4-hydroxytamoxifen (OHT)). DPN is a selective estrogen receptor agonist and OHT is a selective estrogen receptor modulator. One sample (patient 4, 24 hours, control) was omitted by the paper authors due to low quality. Data, the count table and information on the experiment is available at http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE37211.
file<-"https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE37211&format=file&file=GSE37211%5Fcount%5Ftable%2Etxt%2Egz"
countTable<-file %>% url %>% gzcon %>% readLines %>% textConnection %>% read.table(.,sep="\t",header=TRUE)
head(countTable)
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
## ENSG00000000003 673 916 391 830 474 734 370 325 1140 419 1124 393 293 356
## ENSG00000000005 4 1 2 3 2 1 0 1 0 0 0 0 0 0
## ENSG00000000419 239 211 132 209 142 170 216 154 465 196 514 203 203 216
## ENSG00000000457 144 181 88 145 73 107 214 165 496 174 522 204 117 114
## ENSG00000000460 364 188 196 200 213 162 535 184 1461 256 1531 224 181 96
## ENSG00000000938 3 7 2 5 1 3 12 23 41 13 33 12 6 3
## X15 X16 X17 X18 X20 X21 X22 X23 X24
## ENSG00000000003 537 621 787 321 396 372 666 334 674
## ENSG00000000005 0 0 1 0 0 0 0 0 0
## ENSG00000000419 309 355 466 222 211 201 308 176 402
## ENSG00000000457 180 242 315 126 165 141 251 134 297
## ENSG00000000460 358 271 699 159 86 314 219 297 323
## ENSG00000000938 9 17 14 6 6 7 17 8 16
Get all info from GEO. get sample info via getGEO (info from samples)
gse<-getGEO("GSE37211")
## Found 1 file(s)
## GSE37211_series_matrix.txt.gz
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
## File stored at:
## /var/folders/p1/3js9hvbs473g1klcmqm0d8wm0000gn/T//RtmpBBANsd/GPL11154.soft
pdata<-pData(gse[[1]])
target<-pdata[,grep(colnames(pdata),pattern=":ch1")]
target
## agent:ch1 patient:ch1 time:ch1 tissue:ch1
## GSM913873 Control 1 24h Parathyroid tumor
## GSM913874 Control 1 48h Parathyroid tumor
## GSM913875 DPN 1 24h Parathyroid tumor
## GSM913876 DPN 1 48h Parathyroid tumor
## GSM913877 OHT 1 24h Parathyroid tumor
## GSM913878 OHT 1 48h Parathyroid tumor
## GSM913879 Control 2 24h Parathyroid tumor
## GSM913880 Control 2 48h Parathyroid tumor
## GSM913881 DPN 2 24h Parathyroid tumor
## GSM913882 DPN 2 48h Parathyroid tumor
## GSM913883 OHT 2 24h Parathyroid tumor
## GSM913884 OHT 2 48h Parathyroid tumor
## GSM913885 Control 3 24h Parathyroid tumor
## GSM913886 Control 3 48h Parathyroid tumor
## GSM913887 DPN 3 24h Parathyroid tumor
## GSM913888 DPN 3 48h Parathyroid tumor
## GSM913889 OHT 3 24h Parathyroid tumor
## GSM913890 OHT 3 48h Parathyroid tumor
## GSM913891 Control 4 48h Parathyroid tumor
## GSM913892 DPN 4 24h Parathyroid tumor
## GSM913893 DPN 4 48h Parathyroid tumor
## GSM913894 OHT 4 24h Parathyroid tumor
## GSM913895 OHT 4 48h Parathyroid tumor
colnames(target)<-sub(":ch1","",colnames(target))
target<-apply(target,2,as.factor) %>% as.data.frame
dge <- DGEList(counts=countTable)
dge$sample
## group lib.size norm.factors
## X1 1 6940755 1
## X2 1 8180762 1
## X3 1 4176780 1
## X4 1 7412702 1
## X5 1 4537507 1
## X6 1 6115870 1
## X7 1 6473767 1
## X8 1 5264925 1
## X9 1 16470386 1
## X10 1 6302823 1
## X11 1 16146909 1
## X12 1 6230264 1
## X13 1 5881155 1
## X14 1 6452338 1
## X15 1 9613620 1
## X16 1 12316913 1
## X17 1 16944429 1
## X18 1 6221756 1
## X20 1 6176208 1
## X21 1 5756955 1
## X22 1 10392039 1
## X23 1 5029131 1
## X24 1 11935997 1
There can be an effect of agent, time interaction and agent x time interaction. We also expect blocking for patient. We can assess all effects of interest within patient.
design <- model.matrix(~agent*time+patient,target)
rownames(design) = colnames(dge)
design
## (Intercept) agentDPN agentOHT time48h patient2 patient3 patient4
## X1 1 0 0 0 0 0 0
## X2 1 0 0 1 0 0 0
## X3 1 1 0 0 0 0 0
## X4 1 1 0 1 0 0 0
## X5 1 0 1 0 0 0 0
## X6 1 0 1 1 0 0 0
## X7 1 0 0 0 1 0 0
## X8 1 0 0 1 1 0 0
## X9 1 1 0 0 1 0 0
## X10 1 1 0 1 1 0 0
## X11 1 0 1 0 1 0 0
## X12 1 0 1 1 1 0 0
## X13 1 0 0 0 0 1 0
## X14 1 0 0 1 0 1 0
## X15 1 1 0 0 0 1 0
## X16 1 1 0 1 0 1 0
## X17 1 0 1 0 0 1 0
## X18 1 0 1 1 0 1 0
## X20 1 0 0 1 0 0 1
## X21 1 1 0 0 0 0 1
## X22 1 1 0 1 0 0 1
## X23 1 0 1 0 0 0 1
## X24 1 0 1 1 0 0 1
## agentDPN:time48h agentOHT:time48h
## X1 0 0
## X2 0 0
## X3 0 0
## X4 1 0
## X5 0 0
## X6 0 1
## X7 0 0
## X8 0 0
## X9 0 0
## X10 1 0
## X11 0 0
## X12 0 1
## X13 0 0
## X14 0 0
## X15 0 0
## X16 1 0
## X17 0 0
## X18 0 1
## X20 0 0
## X21 0 0
## X22 1 0
## X23 0 0
## X24 0 1
## attr(,"assign")
## [1] 0 1 1 2 3 3 3 4 4
## attr(,"contrasts")
## attr(,"contrasts")$agent
## [1] "contr.treatment"
##
## attr(,"contrasts")$time
## [1] "contr.treatment"
##
## attr(,"contrasts")$patient
## [1] "contr.treatment"
keep <- filterByExpr(dge, design)
dge <- dge[keep, , keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge)
dge$samples
## group lib.size norm.factors
## X1 1 6928344 0.9991826
## X2 1 8166370 0.9871519
## X3 1 4169707 0.9777048
## X4 1 7399280 1.0056375
## X5 1 4529364 0.9772584
## X6 1 6105569 0.9891050
## X7 1 6462881 0.9677161
## X8 1 5256022 0.9647628
## X9 1 16438721 1.0043510
## X10 1 6292354 0.9620275
## X11 1 16116791 0.9946148
## X12 1 6219897 0.9599718
## X13 1 5868039 1.0277969
## X14 1 6437868 1.0050856
## X15 1 9592029 1.0358842
## X16 1 12288733 1.0146357
## X17 1 16904257 1.0421481
## X18 1 6208329 0.9919888
## X20 1 6163365 1.0108964
## X21 1 5745207 1.0209375
## X22 1 10369491 1.0284136
## X23 1 5018532 1.0154103
## X24 1 11909919 1.0238401
An MDS plot shows the leading fold changes (differential expression) between the 23 samples.
plotMDS(dge,labels=paste(target$agent,target$time,target$patient,sep="-"),col=as.double(target$agent))
There is a very strong patient effect!
dge <- estimateDisp(dge, design, robust=TRUE)
plotBCV(dge)
fit <- glmFit(dge,design)
L <- matrix(0,ncol=ncol(design),nrow=9)
colnames(L)<-colnames(design)
L[1,"agentDPN"]<-1
L[2,"agentOHT"]<-1
L[3,]<-L[2,]-L[1,]
L[4,grep(colnames(L),pattern="DPN")]<-1
L[5,grep(colnames(L),pattern="OHT")]<-1
L[6,]<-L[5,]-L[4,]
L[7,"agentDPN:time48h"]<-1
L[8,"agentOHT:time48h"]<-1
L[9,]<-L[8,]-L[7,]
rownames(L)<-c(paste("early",c("DPN-C","OHT-C","OHT-DPN")),paste("late",c("DPN-C","OHT-C","OHT-DPN")),paste("inter",c("DPN-C","OHT-C","OHT-DPN")))
L
## (Intercept) agentDPN agentOHT time48h patient2 patient3
## early DPN-C 0 1 0 0 0 0
## early OHT-C 0 0 1 0 0 0
## early OHT-DPN 0 -1 1 0 0 0
## late DPN-C 0 1 0 0 0 0
## late OHT-C 0 0 1 0 0 0
## late OHT-DPN 0 -1 1 0 0 0
## inter DPN-C 0 0 0 0 0 0
## inter OHT-C 0 0 0 0 0 0
## inter OHT-DPN 0 0 0 0 0 0
## patient4 agentDPN:time48h agentOHT:time48h
## early DPN-C 0 0 0
## early OHT-C 0 0 0
## early OHT-DPN 0 0 0
## late DPN-C 0 1 0
## late OHT-C 0 0 1
## late OHT-DPN 0 -1 1
## inter DPN-C 0 1 0
## inter OHT-C 0 0 1
## inter OHT-DPN 0 -1 1
tests<-apply(L,1,function(fit,contrast) glmLRT(fit,contrast=contrast),fit=fit)
topTagsAll<-lapply(tests,topTags,n=nrow(dge))
dt<-lapply(tests,decideTests)
for (i in 1:nrow(L))
{
hist(topTagsAll[[i]]$table$PValue,main=paste("contrast",names(topTagsAll)[i]),xlab="p-value")
}
The histograms show strange p-value distributions for some of the contrasts, e.g. interactions. It indicates that the null distribution seems to be too broad.
for (i in 1:nrow(L))
{
volcano<- ggplot(topTagsAll[[i]]$table,aes(x=logFC,y=-log10(PValue),color=FDR<0.05)) + geom_point() + scale_color_manual(values=c("black","red")) + ggtitle(paste("contrast",names(topTagsAll)[i]))
print(volcano)
}
for (i in 1:nrow(L))
{
plotSmear(tests[[i]],de.tags=rownames(dge)[as.logical(dt[[i]])],main=paste("contrast",names(tests)[i]))
}
for (i in 1:nrow(L))
{
if(sum(as.logical(dt[[i]]))>=2)
print(pheatmap(cpm(dge,log=TRUE)[as.logical(dt[[i]]),],labels_col = paste(target$agent,target$time,target$patient,sep="-"),main=paste("contrast",names(dt)[i])))
}